Here, we will transfer the label and UMAP coordinates from the discovery cohort to the validation cohort. We will do it one compartment at a time (e.g. gcbc), so we can validate that the transfer is robust
Specifically, we will follow these steps for every compartment:
library(Seurat)
library(scCustomize)
library(tidyverse)
library(caret)
library(class)
library(harmony)
library(here)
library(glue)
library(DT)
library(pheatmap)
source(here("scRNA-seq/bin/SLOcatoR_functions.R"))
source(here("scRNA-seq/bin/utils.R"))
source(here("scRNA-seq/bin/utils_final_clusters.R"))
color_palette <- c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D", "#5A5156", "#AA0DFE",
"#F8A19F", "#F7E1A0", "#1C8356", "#FEAF16", "#822E1C", "#C4451C",
"#1CBE4F", "#325A9B", "#F6222E", "#FE00FA", "#FBE426", "#16FF32",
"black", "#3283FE", "#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
"#90AD1C", "#FE00FA", "#85660D", "#3B00FB", "#822E1C", "coral2",
"#1CFFCE", "#1CBE4F", "#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
"#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD", "#FEAF16", "#683B79",
"#B10DA1", "#1C7F93", "#F8A19F", "dark orange", "#FEAF16", "#FBE426",
"Brown")
# Read data, clean and include annotation
gcbc <- readRDS(here("scRNA-seq/results/R_objects/seurat_objects_revision/4.11-GCBC_seurat_object_discovery_validation_cohorts.rds"))
We proceed to update the annotation:
gcbc <- updateAnnotation(gcbc)
gcbc$annotation_20230508 <- gcbc$annotation_20220619
gcbc$annotation_20230508[is.na(gcbc$annotation_20230508)] <- "unannotated"
gcbc$annotation_20230508 <- factor(
gcbc$annotation_20230508,
levels = c(names(colors_20230508[["GCBC"]]), "unannotated")
)
Idents(gcbc) <- "annotation_20230508"
gcbc$is_reference <- ifelse(
gcbc$cohort_type == "discovery",
"reference",
"query"
)
DimPlot(
gcbc,
group.by = "annotation_20230508",
split.by = "cohort_type",
cols = colors_20230508[["GCBC"]]
)
Let us explore if there are lingering doublets:
Idents(gcbc) <- "cohort_type"
FeaturePlot(gcbc, c("CD3D", "LYZ")) &
scale_color_viridis_c(option = "magma")
FeaturePlot(gcbc[, gcbc$cohort_type == "validation"], "doublet_score_scDblFinder") &
scale_color_viridis_c(option = "magma")
Indeed, one cluster is composed of doublets between B and T cells. Let us find it and remove it:
gcbc <- FindNeighbors(gcbc, dims = 1:25, reduction = "harmony")
gcbc <- FindClusters(gcbc, resolution = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 93201
## Number of edges: 2614021
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8454
## Number of communities: 20
## Elapsed time: 36 seconds
DimPlot(gcbc, cols = rev(color_palette))
markers16 <- FindMarkers(gcbc, ident.1 = "16", only.pos = TRUE, logfc.threshold = 0.5)
head(markers16, 30)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## CD2 0.000000e+00 0.6856149 0.539 0.022 0.000000e+00
## TXNIP 0.000000e+00 0.9780004 0.847 0.292 0.000000e+00
## AAK1 0.000000e+00 0.5630971 0.495 0.040 0.000000e+00
## IL7R 0.000000e+00 0.9389565 0.628 0.018 0.000000e+00
## FYB1 0.000000e+00 0.9690267 0.742 0.046 0.000000e+00
## TCF7 0.000000e+00 0.5165520 0.471 0.061 0.000000e+00
## TRBC1 0.000000e+00 0.6907910 0.453 0.040 0.000000e+00
## TRBC2 0.000000e+00 0.6762832 0.643 0.190 0.000000e+00
## GIMAP7 0.000000e+00 0.8974861 0.657 0.033 0.000000e+00
## GIMAP4 0.000000e+00 0.6752352 0.521 0.027 0.000000e+00
## GIMAP5 0.000000e+00 0.5149683 0.420 0.015 0.000000e+00
## LINC00861 0.000000e+00 0.5514859 0.433 0.010 0.000000e+00
## CD3E 0.000000e+00 0.7747475 0.604 0.028 0.000000e+00
## CD3D 0.000000e+00 0.8821539 0.649 0.044 0.000000e+00
## CD3G 0.000000e+00 0.8096641 0.624 0.016 0.000000e+00
## BCL11B 0.000000e+00 0.6868253 0.567 0.014 0.000000e+00
## IL32 0.000000e+00 0.7945482 0.583 0.059 0.000000e+00
## SLFN5 0.000000e+00 0.5222730 0.502 0.085 0.000000e+00
## CCR7 0.000000e+00 0.5163333 0.443 0.041 0.000000e+00
## CD7 0.000000e+00 0.5152501 0.409 0.014 0.000000e+00
## SARAF 1.196002e-279 0.5336398 0.632 0.236 4.470415e-275
## SRGN 8.029466e-194 0.5098707 0.609 0.290 3.001254e-189
gcbc <- gcbc[, !(gcbc$seurat_clusters == "16" & gcbc$cohort_type == "validation")]
DimPlot(gcbc, cols = rev(color_palette), split.by = "cohort_type")
Reintegrate:
gcbc$type <- str_c(gcbc$cohort_type, gcbc$assay, sep = "_")
shared_hvg <- find_assay_specific_features(gcbc, assay_var = "type")
gcbc <- integrate_assays(
gcbc,
assay_var = "type",
shared_hvg = shared_hvg
)
gcbc <- RunUMAP(gcbc, dims = 1:25, reduction = "harmony")
DimPlot(
gcbc,
group.by = "annotation_20230508",
split.by = "cohort_type",
cols = c(colors_20230508[["GCBC"]], "unannotated" = "gray")
) &
theme(legend.text = element_text(size = 6))
Transfer label:
# Split training and test sets
data_sets <- split_training_and_test_sets(
gcbc,
split_var = "cohort_type",
referece_label = "discovery",
query_label = "validation",
reduction = "harmony",
n_dims = 25
)
# Transfer label
annotation_query_df <- transfer_label(
seurat_obj = gcbc,
training_set = data_sets$training_set,
test_set = data_sets$test_set,
k = 30,
response_var = "annotation_20230508"
)
# Include annotation and UMAP coords in Seurat object
gcbc$annotation_20230508[annotation_query_df$query_cells] <- annotation_query_df$annotation
Idents(gcbc) <- "annotation_20230508"
gcbc$annotation_20230508_probability <- NA
gcbc$annotation_20230508_probability[annotation_query_df$query_cells] <- annotation_query_df$annotation_prob
p1 <- FeaturePlot(
gcbc[, annotation_query_df$query_cells],
"annotation_20230508_probability"
) + scale_color_viridis_c(option = "magma")
p2 <- FeaturePlot(
gcbc[, annotation_query_df$query_cells],
"doublet_score_scDblFinder"
) + scale_color_viridis_c(option = "magma")
p1 | p2
DimPlot(
gcbc,
group.by = "annotation_20230508",
split.by = "cohort_type",
cols = colors_20230508[["GCBC"]]
)
table(gcbc$annotation_20230508_probability)
##
## 0.233333333333333 0.266666666666667 0.3 0.32258064516129 0.333333333333333 0.354838709677419 0.366666666666667 0.387096774193548 0.4 0.40625 0.419354838709677 0.433333333333333 0.451612903225806 0.466666666666667 0.483870967741935 0.5 0.516129032258065 0.533333333333333 0.548387096774194 0.566666666666667 0.580645161290323 0.6 0.612903225806452 0.633333333333333 0.645161290322581 0.666666666666667 0.67741935483871 0.7 0.709677419354839 0.733333333333333 0.741935483870968 0.766666666666667 0.774193548387097 0.78125 0.8 0.806451612903226 0.8125 0.833333333333333 0.838709677419355 0.866666666666667 0.870967741935484 0.875 0.9 0.903225806451613 0.933333333333333 0.935483870967742 0.966666666666667 0.967741935483871 0.96875 1
## 4 6 23 1 37 3 84 2 122 1 4 195 5 265 6 454 6 566 14 597 8 592 12 589 12 612 16 591 16 607 14 669 18 1 693 15 1 795 19 944 16 1 1089 29 1469 37 2229 55 2 6498
Plot dot plot
goi_rna <- c("CXCR4", "AICDA", "FOXP1", "MME", "CD83", "LMO2", "POLA1", "HIST1H4C",
"MKI67", "TOP2A", "CDC20", "CCNB1", "STMN1", "HMGB2", "CD40", "BCL2A1",
"MIR155HG", "EBI3", "TRAF1", "NFKB1", "NFKB2", "RELB", "MYC", "BATF",
"CCR6", "CELF2", "BANK1", "PRDM1", "XBP1")
avgexpr_mat <- AverageExpression(
gcbc[, gcbc$cohort_type == "validation"],
features = goi_rna,
assays = "RNA",
return.seurat = FALSE,
group.by = "annotation_20230508",
slot = "data"
)$RNA
# Define annotation column
cell_types <- levels(gcbc$annotation_20230508)
mycolors <- list(cell_type = c(colors_20230508$GCBC))
annotation_col <- data.frame(cell_type = cell_types)
rownames(annotation_col) <- cell_types
# Scale per row between 0 and 1
input_mat <- t(apply(avgexpr_mat, 1, function(x) (x - min(x)) / diff(range(x))))
# Plot heatmap
pheatmap(
t(input_mat),
annotation_row = annotation_col,
annotation_colors = mycolors,
annotation_names_row = FALSE,
annotation_legend = FALSE,
show_rownames = FALSE,
show_colnames = TRUE,
border_color = NA,
cluster_rows = FALSE,
cluster_cols = FALSE,
gaps_col = c(1, 3, 8),
fontsize_row = 6
)
saveRDS(gcbc, here("scRNA-seq/results/R_objects/seurat_objects_revision/5.11-GCBC_seurat_object_discovery_validation_cohorts_annotation.rds"))
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /home/groups/singlecell/rmassoni/anaconda3/envs/richter/lib/libopenblasp-r0.3.21.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=es_ES.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=es_ES.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] pheatmap_1.0.12 DT_0.27 glue_1.6.2 here_1.0.1 harmony_0.1.1 Rcpp_1.0.10 class_7.3-21 caret_6.0-94 lattice_0.20-45 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.0 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0 tibble_3.1.8 ggplot2_3.4.1 tidyverse_1.3.2 scCustomize_1.1.1 SeuratObject_4.1.3 Seurat_4.3.0 BiocStyle_2.26.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 spatstat.explore_3.0-6 reticulate_1.26 tidyselect_1.2.0 htmlwidgets_1.6.1 grid_4.2.2 Rtsne_0.16 pROC_1.18.0 munsell_0.5.0 codetools_0.2-19 ica_1.0-3 future_1.31.0 miniUI_0.1.1.1 withr_2.5.0 spatstat.random_3.1-3 colorspace_2.1-0 progressr_0.13.0 highr_0.10 knitr_1.42 stats4_4.2.2 ROCR_1.0-11 tensor_1.5 listenv_0.9.0 labeling_0.4.2 polyclip_1.10-4 farver_2.1.1 rprojroot_2.0.3 parallelly_1.34.0 vctrs_0.5.2 generics_0.1.3 ipred_0.9-13 xfun_0.37 timechange_0.2.0 R6_2.5.1 ggbeeswarm_0.7.1 spatstat.utils_3.0-1 cachem_1.0.6 assertthat_0.2.1 promises_1.2.0.1 scales_1.2.1 nnet_7.3-18 googlesheets4_1.0.1 beeswarm_0.4.0 gtable_0.3.1 globals_0.16.2 goftest_1.2-3 timeDate_4022.108 rlang_1.0.6 GlobalOptions_0.1.2 splines_4.2.2 lazyeval_0.2.2
## [52] ModelMetrics_1.2.2.2 gargle_1.3.0 spatstat.geom_3.0-6 broom_1.0.3 BiocManager_1.30.19 yaml_2.3.7 reshape2_1.4.4 abind_1.4-5 modelr_0.1.10 backports_1.4.1 httpuv_1.6.9 tools_4.2.2 lava_1.7.1 bookdown_0.33 ellipsis_0.3.2 jquerylib_0.1.4 RColorBrewer_1.1-3 ggridges_0.5.4 plyr_1.8.8 rpart_4.1.19 deldir_1.0-6 pbapply_1.7-0 cowplot_1.1.1 zoo_1.8-11 haven_2.5.1 ggrepel_0.9.3 cluster_2.1.4 fs_1.6.1 magrittr_2.0.3 data.table_1.14.6 scattermore_0.8 circlize_0.4.15 lmtest_0.9-40 reprex_2.0.2 RANN_2.6.1 googledrive_2.0.0 fitdistrplus_1.1-8 matrixStats_0.63.0 hms_1.1.2 patchwork_1.1.2 mime_0.12 evaluate_0.20 xtable_1.8-4 readxl_1.4.2 gridExtra_2.3 shape_1.4.6 compiler_4.2.2 KernSmooth_2.23-20 crayon_1.5.2 htmltools_0.5.4 later_1.3.0
## [103] tzdb_0.3.0 ggprism_1.0.4 lubridate_1.9.1 DBI_1.1.3 dbplyr_2.3.0 MASS_7.3-58.2 Matrix_1.5-3 cli_3.6.0 parallel_4.2.2 gower_1.0.1 igraph_1.3.5 pkgconfig_2.0.3 sp_1.6-0 plotly_4.10.1 spatstat.sparse_3.0-0 recipes_1.0.4 xml2_1.3.3 paletteer_1.5.0 foreach_1.5.2 vipor_0.4.5 bslib_0.4.2 hardhat_1.2.0 prodlim_2019.11.13 rvest_1.0.3 snakecase_0.11.0 digest_0.6.31 sctransform_0.3.5 RcppAnnoy_0.0.20 janitor_2.2.0 spatstat.data_3.0-0 rmarkdown_2.20 cellranger_1.1.0 leiden_0.4.3 uwot_0.1.14 shiny_1.7.4 lifecycle_1.0.3 nlme_3.1-162 jsonlite_1.8.4 limma_3.54.0 viridisLite_0.4.1 fansi_1.0.4 pillar_1.8.1 ggrastr_1.0.1 fastmap_1.1.0 httr_1.4.4 survival_3.5-3 png_0.1-8 iterators_1.0.14 stringi_1.7.12 sass_0.4.5 rematch2_2.1.2
## [154] irlba_2.3.5.1 future.apply_1.10.0